# -*- coding: utf-8 -*-
# author: Xianlong Dai
# version 13.0 ultra for Basic Algorithm

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import gc
import os
import time

from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.metrics import accuracy_score
from keras.callbacks import ModelCheckpoint
from sklearn.utils.class_weight import compute_class_weight
from sklearn.impute import SimpleImputer

from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
import joblib

import keras
from keras import Sequential
from keras.layers import Dense, Dropout, BatchNormalization
from keras.utils import to_categorical

from keras import regularizers
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping, ReduceLROnPlateau
import xgboost as xgb 
import merge_report as mdrw
import result_report as rr


# XGBoost hyper-parameters for the XGBClassifier base learner (used in begin_fit).
param_2 = {  
    'objective': 'binary:logistic',  # binary classification, probability output
    'eval_metric': 'logloss',  # evaluation metric: negative log-likelihood
    'eta': 0.1,  # learning rate (shrinkage applied per boosting round)
    'max_depth': 3,  # maximum depth of each tree
    'min_child_weight': 1,  # minimum sum of instance weights required in a child
    'subsample': 0.8,  # row subsampling ratio per tree
    'colsample_bytree': 0.8,  # column subsampling ratio per tree
    'gamma': 0,  # minimum loss reduction required to make a split
    'n_estimators': 100,  # number of boosting rounds (sklearn-API parameter)
    'seed': 42  # RNG seed for reproducibility
} 

import tensorflow as tf
from keras import backend as K
 
def focal_loss(gamma=2., alpha=0.25):
    """Return a Keras-compatible binary focal loss (Lin et al., 2017).

    gamma: focusing parameter; larger values down-weight easy examples.
    alpha: weight applied to the positive class.
    Returns a closure ``focal_loss_fixed(y_true, y_pred)`` suitable for
    ``model.compile(loss=...)``.
    """
    def focal_loss_fixed(y_true, y_pred):
        # Bug fix: clip predictions away from exact 0/1 so K.log never
        # receives 0 — previously a saturated prediction produced NaN loss.
        eps = K.epsilon()
        y_pred = K.clip(y_pred, eps, 1. - eps)
        pt_1 = tf.where(y_true == 1, y_pred, tf.ones_like(y_pred))
        pt_0 = tf.where(y_true != 1, y_pred, tf.zeros_like(y_pred))
        return -K.sum(alpha * K.pow(1. - pt_1, gamma) * K.log(pt_1)) \
               -K.sum((1 - alpha) * K.pow(pt_0, gamma) * K.log(1. - pt_0))

    return focal_loss_fixed

def prepare_features_and_labels(training, testing):
    """Split train/test DataFrames into feature frames and label arrays.

    Both inputs must contain a 'label' column. Returns a 4-tuple
    (train_features, train_labels, test_features, test_labels) where the
    feature entries are DataFrames and the labels are (n, 1) numpy arrays.
    """
    train_features = training.drop(['label'], axis=1)
    train_labels = training[['label']].values
    test_features = testing.drop(['label'], axis=1)
    test_labels = testing[['label']].values
    return train_features, train_labels, test_features, test_labels


def add_relative_features(df):
    # Placeholder: currently a no-op that returns df unchanged.
    # NOTE(review): presumably intended to add derived/ratio features later — confirm.
    return df


def normalise_df3(df, features=None):
    """Standardize (zero mean, unit variance) columns of ``df`` IN PLACE.

    df: pandas DataFrame to normalise (mutated and also returned).
    features: optional list of column names; defaults to every numeric column.
    Raises ValueError when a requested column is missing or non-numeric.
    """
    # Validate before doing any work (and before importing sklearn) so a
    # bad `features` argument fails fast with a clear error.
    if features is None:
        features = df.select_dtypes(include=[np.number]).columns.tolist()
    else:
        for feature in features:
            if feature not in df.columns:
                raise ValueError(f"Column '{feature}' not found in dataframe")
            if not pd.api.types.is_numeric_dtype(df[feature]):
                raise ValueError(f"Column '{feature}' is not numeric")
    # Redundant local imports of pandas/numpy removed — both are already
    # imported at module level.
    from sklearn.preprocessing import StandardScaler
    scaler = StandardScaler()
    df[features] = scaler.fit_transform(df[features])
    return df



def do_some_tricks(df):
    """Keep only the columns labelled 0 through 22, dropping the rest.

    Prints the incoming column count, then drops every column whose label
    is an integer in range(23, df.shape[1]). Assumes the DataFrame's
    columns are labelled with consecutive integers starting at 0.
    """
    print(df.shape[1])
    surplus = list(range(23, df.shape[1]))
    return df.drop(columns=surplus)

# Cross-validated recursive feature elimination (RFECV): a wrapper-style
# feature-selection method, here driven by an LGBM classifier.
def get_suitable_features(features, labels):
    """Fit RFECV on the given features/labels, print and plot the CV
    scores, and return the fitted selector."""
    estimator = LGBMClassifier(n_estimators=100, max_depth=3)

    selector = RFECV(estimator=estimator, step=1, cv=StratifiedKFold(3),
                     scoring='accuracy', verbose=1)

    selector.fit(np.array(features), np.array(labels))

    print("Optimal number of features : %d" % selector.n_features_)
    print("Ranking of features : %s" % selector.ranking_)

    # Plot mean CV score as a function of the number of selected features.
    scores = selector.cv_results_['mean_test_score']
    plt.figure()
    plt.xlabel("Number of features selected")
    plt.ylabel("Cross validation score (nb of correct classifications)")
    plt.plot(range(1, len(scores) + 1), scores)
    plt.show()

    return selector

def create_blender(X, Y):
    """Build and compile the Keras MLP used to blend base-learner outputs.

    X: 2-D array-like; only len(X[0]) (the feature count) is used.
    Y: one-hot encoded targets; len(Y[0]) sets the softmax output width.
    Returns a compiled Sequential model (categorical cross-entropy, Adam).
    """
    blender = Sequential()

    # Four Dense blocks (512 -> 256 -> 128 -> 32), each with ELU activation,
    # L2 weight regularisation, batch norm and 30% dropout.
    blender.add(Dense(512, activation='elu', input_dim=len(X[0]),
                      kernel_regularizer=regularizers.l2(0.001)))
    blender.add(BatchNormalization())
    blender.add(Dropout(0.3))

    blender.add(Dense(256, activation='elu', kernel_regularizer=regularizers.l2(0.001)))
    blender.add(BatchNormalization())
    blender.add(Dropout(0.3))

    blender.add(Dense(128, activation='elu', kernel_regularizer=regularizers.l2(0.001)))
    blender.add(BatchNormalization())
    blender.add(Dropout(0.3))

    blender.add(Dense(32, activation='elu', kernel_regularizer=regularizers.l2(0.001)))
    blender.add(BatchNormalization())
    blender.add(Dropout(0.3))

    blender.add(Dense(len(Y[0]), activation='softmax'))

    # Bug fix: the `lr` argument was deprecated and then removed from Keras
    # optimizers; `learning_rate` is the supported spelling.
    adam = Adam(learning_rate=0.001)
    blender.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])
    return blender




class BlenderClassifier(object):
    """Stacking ("blending") ensemble classifier.

    Out-of-fold predictions from a pool of base learners are concatenated
    with the original features and used to train a Keras neural-network
    blender produced by ``create_blender_func``.
    """

    def __init__(self, learners, create_blender_func, n_folds=5, max_epoch=100):
        # learners: list of sklearn-style estimators (fit/predict).
        # create_blender_func: callable (X, Y) -> compiled Keras model.
        self.learners = learners
        self.n_folds = n_folds
        self.max_epoch = max_epoch
        self.history = {}
        self.create_blender = create_blender_func

    def fit(self, X, Y, test_X, test_Y, filepath, prefix_str, batch_size=128, verbose=True):
        """Fit the base learners out-of-fold, then fit the blender on
        [original features | base-learner predictions].

        NOTE(review): test_X, test_Y and batch_size are currently unused;
        they are kept so existing callers keep working.
        Side effects: every base learner is pickled to
        "{prefix}_{ClassName}_{idx}.pkl" (overwritten each fold, so the
        saved models come from the LAST fold), and the trained blender is
        saved to ``filepath``.
        """
        kfolds = StratifiedKFold(self.n_folds, shuffle=True)
        predictions = [[] for _ in self.learners]
        targets = []
        XX = []
        for fold_idx, (t, v) in enumerate(kfolds.split(X, Y)):
            if verbose:
                print()
                print('\n !!!!  ***@@@####  commencing fold {}/{}...'.format(fold_idx + 1,
                                                        self.n_folds))
                print("*****************************************")
            targets.extend(Y[v])
            print(self.learners)
            print(type(self.learners))
            count1 = 2
            for l_idx, learner in enumerate(self.learners):

                if verbose:
                    print('    fitting {}...'.format(type(learner)))
                count1 = count1 + 1
                # Derive the class name from the repr, e.g. "RandomForestClassifier(...)"
                # -> "RandomForestClassifier". Bug fix: previously `new_s` was
                # undefined (or stale from a prior iteration) when the repr
                # contained no '('.
                learner_repr = str(learner)
                idx = learner_repr.find('(')
                new_s = learner_repr[:idx] if idx != -1 else learner_repr
                print(new_s)
                learner.fit(X[t], Y[t])
                joblib.dump(learner, "{}_{}_{}.pkl".format(str(prefix_str), str(new_s), count1))

                if verbose:
                    print('      predicting {}...'.format(type(learner)))
                predictions[l_idx].extend(learner.predict(X[v]))

            XX.extend(X[v])
            del t, v
            gc.collect()

        if verbose:
            print('  creating blender...')

        targets = to_categorical(targets)
        predictions = np.swapaxes(predictions, 0, -1)
        x_new = np.concatenate([XX, predictions], axis=-1)

        # Bug fix: use the injected factory (self.create_blender) instead of
        # the module-level create_blender, so a custom factory passed to
        # __init__ is actually honoured.
        self.blender = self.create_blender(x_new, targets)

        del XX
        gc.collect()

        if verbose:
            print('  fitting blender...')

        early_stop = EarlyStopping(monitor='val_loss', patience=10, verbose=1, mode='min')
        reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=3, verbose=1, mode='min')
        checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='min')

        # Balanced class weights computed from the one-hot targets.
        label_ids = np.argmax(targets, axis=1)
        class_weights = compute_class_weight(class_weight='balanced',
                                             classes=np.unique(label_ids),
                                             y=label_ids)
        class_weights_dict = {label: weight for label, weight in zip(np.unique(label_ids), class_weights)}
        # Class 0 defaults to weight 0.0 but is overridden by the balanced
        # weight whenever class 0 is actually present in the data.
        class_weights_dict1 = {0: 0.0, **class_weights_dict}

        print(class_weights_dict1)

        self.blender.fit(x_new, targets, validation_split=0.2, epochs=self.max_epoch,
                         callbacks=[early_stop, reduce_lr, checkpoint],
                         class_weight=class_weights_dict1)

        self.blender.save(filepath)

        if verbose:
            print('done!')

    def predict(self, X, filepath, verbose=True):
        """Predict with the stacked ensemble.

        Runs every base learner on X, concatenates their predictions with X,
        loads the blender from ``filepath`` and returns its softmax output.
        """
        if verbose:
            print('[*] Predicting...')

        predictions = [[] for _ in self.learners]
        for l_idx, learner in enumerate(self.learners):
            if verbose:
                print('  predicting using {}...'.format(type(learner)))
            predictions[l_idx].extend(learner.predict(X))

        predictions = np.swapaxes(predictions, 0, -1)

        if verbose:
            print('  blending predictions...')

        x_new = np.concatenate([X, predictions], axis=-1)

        from keras.models import load_model
        self.blender = load_model(filepath)
        return self.blender.predict(x_new)

def begin_fit(features, labels, test_features, test_labels, filepath, prefix_str):
    """Train the stacking ensemble and print its validation accuracy.

    Builds the default pool of base learners (random forests, extra trees,
    LightGBM, a decision tree and XGBoost), fits a BlenderClassifier on
    (features, labels), then scores its predictions on the test split.
    Side effects: base learners are pickled and the blender is saved to
    ``filepath`` (see BlenderClassifier.fit).
    """
    print('[*] Begin fit...')

    rfc1 = RandomForestClassifier(n_estimators=100,
                                  min_samples_split=2,
                                  class_weight='balanced',
                                  criterion='entropy',
                                  n_jobs=-1)

    rfc2 = RandomForestClassifier(n_estimators=100,
                                  min_samples_split=3,
                                  class_weight='balanced',
                                  criterion='entropy',
                                  n_jobs=-1)

    rfc3 = RandomForestClassifier(n_estimators=100,
                                  min_samples_split=5,
                                  class_weight='balanced',
                                  criterion='entropy',
                                  n_jobs=-1)

    # Extremely randomized trees.
    etc1 = ExtraTreesClassifier(n_estimators=100,
                                min_samples_split=2,
                                class_weight='balanced',
                                criterion='entropy',
                                n_jobs=-1)

    etc2 = ExtraTreesClassifier(n_estimators=100,
                                min_samples_split=3,
                                class_weight='balanced',
                                criterion='entropy',
                                n_jobs=-1)

    etc3 = ExtraTreesClassifier(n_estimators=100,
                                min_samples_split=5,
                                class_weight='balanced',
                                criterion='entropy',
                                n_jobs=-1)

    # LightGBM: histogram-based gradient-boosted decision trees, at three
    # different learning rates.
    lbc1 = LGBMClassifier(n_estimators=500,
                         learning_rate=0.001)

    lbc2 = LGBMClassifier(n_estimators=500,
                         learning_rate=0.01)

    lbc3 = LGBMClassifier(n_estimators=500,
                         learning_rate=0.1)

    # Bug fix: max_features='auto' was deprecated and then removed from
    # scikit-learn; for classifiers it was equivalent to 'sqrt'.
    c45_4 = DecisionTreeClassifier(criterion='entropy', splitter='best', max_depth=100, min_samples_leaf=2, max_features='sqrt')

    xgb1 = xgb.XGBClassifier(**param_2)

    default_learners = [rfc1, rfc2, rfc3,
                        etc1, etc2, etc3,
                        lbc1, lbc2, lbc3,
                        c45_4, xgb1]

    Xt = np.array(features)
    Yt = np.array(labels)
    Xt12 = np.array(test_features)
    Yt12 = np.array(test_labels)

    bc = BlenderClassifier(learners=default_learners,
                           create_blender_func=create_blender)

    bc.fit(Xt, Yt, Xt12, Yt12, filepath, prefix_str)
    y_pred = bc.predict(Xt12, filepath)

    y_pred = np.argmax(y_pred, axis=-1)
    # Bug fix: compare class indices directly. The previous one-hot
    # round-trip (to_categorical on both sides) raised a shape mismatch
    # whenever y_pred happened to miss the highest class label; the score
    # itself is unchanged (exact-match one-hot accuracy == label accuracy).
    score = accuracy_score(np.ravel(Yt12), y_pred)

    print(" ")
    print('!! ==========******final validation score: {}'.format(score))
    print(" ")
    

def begin_predict2(features, filepath, prefix_str, save_csv_name):
    """Reload the pickled base learners and run the stacked prediction.

    Loads the eleven base-learner pickles written during training, wraps
    them in a BlenderClassifier, predicts on ``features`` using the blender
    stored at ``filepath``, and writes the argmax class labels to
    ``save_csv_name`` as a one-column 'label' CSV.
    """
    # (class name, file suffix) pairs matching the names used at dump time.
    model_specs = [
        ('RandomForestClassifier', 3),
        ('RandomForestClassifier', 4),
        ('RandomForestClassifier', 5),
        ('ExtraTreesClassifier', 6),
        ('ExtraTreesClassifier', 7),
        ('ExtraTreesClassifier', 8),
        ('LGBMClassifier', 9),
        ('LGBMClassifier', 10),
        ('LGBMClassifier', 11),
        ('DecisionTreeClassifier', 12),
        ('XGBClassifier', 13),
    ]
    loaded_learners = [
        joblib.load('{}_{}_{}.pkl'.format(str(prefix_str), cls_name, suffix))
        for cls_name, suffix in model_specs
    ]

    classifier = BlenderClassifier(learners=loaded_learners,
                                   create_blender_func=create_blender)

    print('[*] Begin predict for test data...')
    y_pred = classifier.predict(features, filepath)

    y_pred = np.argmax(y_pred, axis=-1)

    print('[*] Save to CSV...')
    sub = pd.DataFrame()
    sub['label'] = y_pred
    sub.to_csv(save_csv_name, index=False)
    


def main():
    """Entry point: load train/test CSVs, optionally train the stacked
    ensemble, predict on the test set, save a CSV and emit a report."""
    prefix_str = ''

    tr_csv = f'./{prefix_str}_train.csv'
   
    # NOTE(review): ts_csv is reassigned below; only the LAST assignment
    # (the "_cernet" file) takes effect — the earlier lines look like
    # leftover dev toggles for switching the evaluation dataset.
    ts_csv = f'./{prefix_str}_test.csv'
    ts_csv = f'./{prefix_str}_op.csv'
    ts_csv = f'./{prefix_str}_cernet.csv'



    
    # Output CSV name derived from the test file name (strip './' and '.csv').
    save_csv_name = f'{ts_csv[2:-4]}_pre.csv'
    filepath = f'{prefix_str}.h5'


    # Skip training for the known evaluation-only datasets; otherwise train.
    if ('op' in ts_csv) or ('cernet' in ts_csv) or ('mj' in ts_csv) or ('mirror' in ts_csv):
        train_switch = 0
    else:
        train_switch = 1
    
    print('        @#$ train_switch:  ' + str(train_switch))

    training_data = pd.read_csv(tr_csv)
    testing_data = pd.read_csv(ts_csv)

    print(training_data.describe())
    print(testing_data.describe())

    
    training_features, training_labels, testing_features, test_labels = \
        prepare_features_and_labels(training_data, testing_data)

    
    # Concatenate train+test features (e.g. for consistent preprocessing),
    # then split back by row count.
    all_features = pd.concat([training_features, testing_features], axis=0)
    print(all_features.describe())
    

    training_features = np.array(all_features[:len(training_data)])
    testing_features = np.array(all_features[len(training_data):])


    if train_switch == 1:
        begin_fit(training_features, training_labels, testing_features, test_labels, filepath, prefix_str)
    
    # Always run prediction (uses the pickled learners and saved blender).
    begin_predict2(testing_features, filepath, prefix_str, save_csv_name)

    # Merge predictions with ground truth and render the evaluation report.
    y_true11, y_pred11 = mdrw.merge1(ts_csv, save_csv_name)
    
    rr.report(y_true11, y_pred11, ['NoTe', 'Te'], cmp="Blues")
    
    



# Script entry point.
if __name__ == '__main__':
    main()